We need only 5 columns from the dataset. Some data exploration shows
that the break-point columns are NA in the datasets from
1990 and earlier, so those .csv files can be ignored for our
purposes.
# Import the 2000 ATP season, dropping the two seed columns we never use.
matches <- read_csv(
  "data/atp/atp_matches_2000.csv",
  col_select = c(-winner_seed, -loser_seed)
)
## Rows: 3378 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): tourney_id, tourney_name, surface, tourney_level, winner_entry, wi...
## dbl (33): draw_size, tourney_date, match_num, winner_id, winner_ht, winner_a...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## import only the columns you need
# matches = read_csv("data/atp/atp_matches_1991.csv",
# col_types = cols_only(best_of = 'i', w_bpFaced = 'i', w_bpSaved = 'i', l_bpFaced = 'i', l_bpSaved = 'i'))
# Quick structural overview of the imported matches table.
glimpse(matches)
## Rows: 3,378
## Columns: 47
## $ tourney_id <chr> "2000-301", "2000-301", "2000-301", "2000-301", "20…
## $ tourney_name <chr> "Auckland", "Auckland", "Auckland", "Auckland", "Au…
## $ surface <chr> "Hard", "Hard", "Hard", "Hard", "Hard", "Hard", "Ha…
## $ draw_size <dbl> 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,…
## $ tourney_level <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
## $ tourney_date <dbl> 20000110, 20000110, 20000110, 20000110, 20000110, 2…
## $ match_num <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ winner_id <dbl> 103163, 102607, 103252, 103507, 102103, 102021, 101…
## $ winner_entry <chr> NA, "Q", NA, NA, "Q", NA, NA, NA, NA, "Q", NA, NA, …
## $ winner_name <chr> "Tommy Haas", "Juan Balcells", "Alberto Martin", "J…
## $ winner_hand <chr> "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "…
## $ winner_ht <dbl> 188, 190, 175, 183, 180, 175, 185, 180, 193, 193, 1…
## $ winner_ioc <chr> "GER", "ESP", "ESP", "ESP", "USA", "USA", "SWE", "S…
## $ winner_age <dbl> 21.77139, 24.55852, 21.39083, 19.90965, 27.38125, 2…
## $ loser_id <dbl> 101543, 102644, 102238, 103819, 102765, 101647, 103…
## $ loser_entry <chr> NA, NA, NA, NA, NA, NA, "WC", "Q", NA, NA, NA, "WC"…
## $ loser_name <chr> "Jeff Tarango", "Franco Squillari", "Alberto Berasa…
## $ loser_hand <chr> "L", "L", "R", "R", "R", "R", "R", "R", "L", "R", "…
## $ loser_ht <dbl> 180, 183, 173, 185, 185, 175, 185, 188, 193, 188, 1…
## $ loser_ioc <chr> "USA", "ARG", "ESP", "SUI", "FRA", "ZIM", "NZL", "U…
## $ loser_age <dbl> 31.13758, 24.38604, 26.53525, 18.42300, 23.77002, 3…
## $ score <chr> "7-5 4-6 7-5", "7-5 7-5", "6-3 6-1", "6-4 6-4", "0-…
## $ best_of <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ round <chr> "R32", "R32", "R32", "R32", "R32", "R32", "R32", "R…
## $ minutes <dbl> 108, 85, 56, 68, 115, 140, 84, 77, 90, 57, 144, 96,…
## $ w_ace <dbl> 18, 5, 0, 5, 1, 5, 4, 8, 5, 3, 1, 7, 3, 5, 16, 11, …
## $ w_df <dbl> 4, 3, 0, 1, 2, 3, 8, 2, 3, 1, 9, 3, 3, 1, 5, 1, 0, …
## $ w_svpt <dbl> 96, 76, 55, 53, 98, 101, 60, 69, 68, 41, 111, 65, 4…
## $ w_1stIn <dbl> 49, 52, 35, 28, 66, 59, 31, 29, 43, 28, 65, 44, 30,…
## $ w_1stWon <dbl> 39, 39, 25, 26, 39, 45, 28, 26, 30, 23, 45, 32, 23,…
## $ w_2ndWon <dbl> 28, 13, 12, 15, 14, 19, 14, 20, 17, 10, 22, 12, 12,…
## $ w_SvGms <dbl> 17, 12, 8, 10, 13, 17, 10, 10, 13, 8, 17, 11, 9, 8,…
## $ w_bpSaved <dbl> 3, 5, 1, 0, 6, 1, 2, 1, 4, 2, 9, 2, 2, 3, 2, 0, 2, …
## $ w_bpFaced <dbl> 5, 6, 1, 0, 11, 4, 3, 2, 7, 2, 13, 4, 3, 3, 4, 0, 2…
## $ l_ace <dbl> 7, 5, 0, 11, 8, 3, 1, 7, 11, 0, 2, 8, 2, 0, 14, 4, …
## $ l_df <dbl> 8, 10, 6, 2, 8, 3, 5, 3, 10, 3, 5, 4, 7, 5, 4, 0, 4…
## $ l_svpt <dbl> 106, 74, 56, 70, 92, 105, 69, 64, 87, 63, 110, 100,…
## $ l_1stIn <dbl> 55, 32, 33, 43, 46, 65, 45, 41, 43, 29, 55, 59, 32,…
## $ l_1stWon <dbl> 39, 25, 20, 29, 34, 41, 28, 24, 33, 21, 38, 38, 20,…
## $ l_2ndWon <dbl> 29, 18, 7, 14, 18, 21, 12, 13, 14, 11, 26, 17, 8, 1…
## $ l_SvGms <dbl> 17, 12, 8, 10, 12, 17, 10, 9, 14, 8, 17, 11, 10, 8,…
## $ l_bpSaved <dbl> 4, 3, 7, 6, 5, 2, 9, 3, 5, 5, 3, 14, 1, 7, 2, 2, 4,…
## $ l_bpFaced <dbl> 7, 6, 11, 8, 9, 5, 12, 6, 11, 9, 8, 18, 5, 11, 5, 5…
## $ winner_rank <dbl> 11, 211, 48, 45, 167, 50, 60, 43, 37, 115, 72, 38, …
## $ winner_rank_points <dbl> 1612, 157, 726, 768, 219, 722, 626, 785, 843, 344, …
## $ loser_rank <dbl> 63, 49, 59, 61, 34, 70, 246, 334, 68, 95, 52, 76, 5…
## $ loser_rank_points <dbl> 595, 723, 649, 616, 873, 563, 135, 88, 571, 416, 71…
# Total converted break points per match: break points faced minus break
# points saved, summed over both players.
matches <- matches %>%
  mutate(total_breaks = (w_bpFaced - w_bpSaved) + (l_bpFaced - l_bpSaved))
# Average breaks per (maximum possible) set, restricted to best-of-five
# matches. Filtering first keeps the per-row division to the rows we need.
matches %>%
  filter(best_of == 5) %>%
  transmute(breaks_per_set = total_breaks / best_of) %>%
  summarise(avg_breaks = mean(breaks_per_set, na.rm = TRUE))
## # A tibble: 1 × 1
## avg_breaks
## <dbl>
## 1 1.55
After dropping NAs, we are left with ~1 million
observations (ATP) and ~35k observations (WTA).
Help taken from: https://stackoverflow.com/questions/5788117/only-read-selected-columns and https://stackoverflow.com/questions/41355912/extracting-a-numeric-value-from-multiple-filename-in-r
# ----------- read in all ATP match csv files -----------
# NB: list.files() interprets `pattern` as a *regex*, not a shell glob.
# The old "atp_matches*" meant "atp_matche" followed by zero or more "s"
# and only matched the intended files by accident; anchor the prefix.
filenames <- list.files(
  path = "./data/atp/",
  pattern = "^atp_matches",
  full.names = TRUE
)
# Import only the break-point analysis columns from every season file,
# tag each row with the year extracted from the file name, and drop rows
# with any NA (the pre-1991 files lack break-point stats entirely).
combo_data_ATP <- map_df(
  filenames,
  ~ read_csv(
      .x,
      col_types = cols_only(
        best_of = "i",
        w_bpFaced = "i", w_bpSaved = "i",
        l_bpFaced = "i", l_bpSaved = "i",
        surface = "c"
      ),
      show_col_types = FALSE
    ) %>%
    # last run of digits in the path is the season year
    mutate(year = as.integer(stri_extract_last(.x, regex = "\\d+")))
) %>%
  na.omit()
# ----------- calculate stats -----------
# Converted breaks per match, normalised by the maximum number of sets,
# then averaged within each year x surface cell. `.groups = "drop_last"`
# makes the previously implicit year grouping of the result explicit
# (breaks_per_year_ATP below relies on it) and silences the summarise()
# grouping message.
breaks_stats_ATP <- combo_data_ATP %>%
  mutate(
    total_breaks = (w_bpFaced - w_bpSaved) + (l_bpFaced - l_bpSaved),
    breaks_per_set = total_breaks / best_of
  ) %>%
  group_by(year, surface) %>%
  summarise(
    avg_breaks_surface = mean(breaks_per_set, na.rm = TRUE),
    .groups = "drop_last"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Collapse the surface dimension: mean of the per-surface averages,
# leaving one row per year.
breaks_per_year_ATP <- breaks_stats_ATP %>%
  summarise(avg_breaks = mean(avg_breaks_surface))
# ----------- read in all WTA match csv files -----------
# NB: list.files() interprets `pattern` as a *regex*, not a shell glob.
# The old "wta_matches*" meant "wta_matche" followed by zero or more "s"
# and only matched the intended files by accident; anchor the prefix.
filenames <- list.files(
  path = "./data/wta/",
  pattern = "^wta_matches",
  full.names = TRUE
)
# Same import scheme as for the ATP files: keep only the break-point
# columns, tag rows with the season year from the file name, drop NAs.
combo_data_WTA <- map_df(
  filenames,
  ~ read_csv(
      .x,
      col_types = cols_only(
        best_of = "i",
        w_bpFaced = "i", w_bpSaved = "i",
        l_bpFaced = "i", l_bpSaved = "i",
        surface = "c"
      ),
      show_col_types = FALSE
    ) %>%
    # last run of digits in the path is the season year
    mutate(year = as.integer(stri_extract_last(.x, regex = "\\d+")))
) %>%
  na.omit()
# ----------- calculate stats -----------
# Converted breaks per match, normalised by the maximum number of sets,
# then averaged within each year x surface cell. `.groups = "drop_last"`
# makes the previously implicit year grouping of the result explicit
# (breaks_per_year_WTA below relies on it) and silences the summarise()
# grouping message.
breaks_stats_WTA <- combo_data_WTA %>%
  mutate(
    total_breaks = (w_bpFaced - w_bpSaved) + (l_bpFaced - l_bpSaved),
    breaks_per_set = total_breaks / best_of
  ) %>%
  group_by(year, surface) %>%
  summarise(
    avg_breaks_surface = mean(breaks_per_set, na.rm = TRUE),
    .groups = "drop_last"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Collapse the surface dimension for the women's tour as well, then
# print the result for inspection.
breaks_per_year_WTA <- breaks_stats_WTA %>%
  summarise(avg_breaks = mean(avg_breaks_surface))
breaks_per_year_WTA
## # A tibble: 20 × 2
## year avg_breaks
## <int> <dbl>
## 1 2003 2.46
## 2 2004 2.58
## 3 2005 2.53
## 4 2006 2.51
## 5 2007 2.40
## 6 2008 2.53
## # … with 14 more rows
# Plot 1
# ------- Combine breaks_per_year datasets for men and women, and plot ----
# The per-tour mean reference lines are now drawn from a small data frame
# and mapped through the same `colour = id` aesthetic as the points, so
# each line automatically takes that tour's legend colour. The old
# hard-coded "red"/"blue" lines did not match ggplot's default discrete
# point colours, visually disconnecting the lines from the legend.
tour_means <- tibble(
  id = c("ATP", "WTA"),
  avg_breaks = c(
    mean(breaks_per_year_ATP$avg_breaks),
    mean(breaks_per_year_WTA$avg_breaks)
  )
)
bind_rows(breaks_per_year_ATP, breaks_per_year_WTA, .id = "id") %>%
  mutate(id = recode(id, "1" = "ATP", "2" = "WTA")) %>%
  ggplot(aes(x = year, y = avg_breaks, colour = id)) +
  geom_point() +
  geom_hline(data = tour_means, aes(yintercept = avg_breaks, colour = id)) +
  labs(
    title = "How common are service breaks? (1991-current)",
    y = "Breaks per set",
    x = "Time"
  )
# Plot 2
# ------- Combine the per-surface break stats for both tours, and plot ----
# NB: this view retains the `playing surface` dimension, faceted below.
surface_plot_data <- bind_rows(breaks_stats_ATP, breaks_stats_WTA, .id = "id")
surface_plot_data %>%
  mutate(id = recode(id, "1" = "ATP", "2" = "WTA")) %>%
  ggplot(aes(x = year, y = avg_breaks_surface, colour = id)) +
  geom_point() +
  facet_wrap(vars(surface)) +
  labs(
    title = "How common are service breaks? (1991-current)",
    subtitle = "Dependence on playing surface",
    y = "Breaks per set",
    x = "Time"
  )
# Print the per-year, per-surface WTA break averages for inspection.
breaks_stats_WTA
## # A tibble: 68 × 3
## # Groups: year [20]
## year surface avg_breaks_surface
## <int> <chr> <dbl>
## 1 2003 Clay 2.71
## 2 2003 Grass 2.21
## 3 2003 Hard 2.45
## 4 2004 Clay 2.91
## 5 2004 Grass 2.19
## 6 2004 Hard 2.63
## # … with 62 more rows